/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.analysis.wikipedia;

import java.io.StringReader;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.tests.analysis.BaseTokenStreamFactoryTestCase;

/**
 * Simple tests to ensure the wikipedia tokenizer is working.
 *
 * <p>Each test builds a tokenizer through the factory registered under the name {@code
 * "Wikipedia"}, feeds it a short piece of wiki markup and compares the emitted terms, offsets,
 * types and position increments against hard-coded expectations.
 */
public class TestWikipediaTokenizerFactory extends BaseTokenStreamFactoryTestCase {

  /** Name under which the tokenizer factory is looked up. */
  private static final String WIKIPEDIA = "Wikipedia";

  /** Factory argument selecting which kind of tokens are emitted. */
  private static final String TOKEN_OUTPUT = "tokenOutput";

  /** Factory argument listing the token types that are emitted as a single untokenized term. */
  private static final String UNTOKENIZED_TYPES = "untokenizedTypes";

  /** Value passed for {@link #UNTOKENIZED_TYPES} by the tests that exercise untokenized output. */
  private static final String CATEGORY_AND_ITALICS =
      WikipediaTokenizer.CATEGORY + ", " + WikipediaTokenizer.ITALICS;

  /** Plain words followed by one category link; "foo" occupies offsets 21-24. */
  private static final String CATEGORY_TEXT = "This is a [[Category:foo]]";

  /** Mix of category links, internal links, italics and a plain word. */
  private static final String MARKUP_TEXT =
      "[[Category:a b c d]] [[Category:e f g]] [[link here]] [[link there]] ''italics here'' something ''more italics'' [[Category:h   i   j]]";

  /**
   * Creates a tokenizer from the {@link #WIKIPEDIA} factory and points it at {@code text}.
   *
   * @param text the input to tokenize
   * @param keysAndValues alternating factory argument names and values; may be empty
   * @return a tokenizer whose reader has been set but which has not been reset or consumed
   * @throws Exception if the factory cannot be created from the given arguments
   */
  private Tokenizer newTokenizer(String text, String... keysAndValues) throws Exception {
    Tokenizer tokenizer = tokenizerFactory(WIKIPEDIA, keysAndValues).create(newAttributeFactory());
    tokenizer.setReader(new StringReader(text));
    return tokenizer;
  }

  /** Factory created without any arguments: the category term is typed as a category. */
  public void testTokenizer() throws Exception {
    assertTokenStreamContents(
        newTokenizer(CATEGORY_TEXT),
        new String[] {"This", "is", "a", "foo"},
        new int[] {0, 5, 8, 21},
        new int[] {4, 7, 9, 24},
        new String[] {"<ALPHANUM>", "<ALPHANUM>", "<ALPHANUM>", WikipediaTokenizer.CATEGORY},
        new int[] {1, 1, 1, 1},
        CATEGORY_TEXT.length());
  }

  /**
   * Explicit {@code TOKENS_ONLY} output: the expectations are the same as in {@link
   * #testTokenizer()}.
   */
  public void testTokenizerTokensOnly() throws Exception {
    assertTokenStreamContents(
        newTokenizer(
            CATEGORY_TEXT, TOKEN_OUTPUT, Integer.toString(WikipediaTokenizer.TOKENS_ONLY)),
        new String[] {"This", "is", "a", "foo"},
        new int[] {0, 5, 8, 21},
        new int[] {4, 7, 9, 24},
        new String[] {"<ALPHANUM>", "<ALPHANUM>", "<ALPHANUM>", WikipediaTokenizer.CATEGORY},
        new int[] {1, 1, 1, 1},
        CATEGORY_TEXT.length());
  }

  /**
   * {@code UNTOKENIZED_ONLY} output with category and italics as untokenized types: each category
   * and italics span is expected as one term, while link text and plain words are still expected
   * word by word.
   */
  public void testTokenizerUntokenizedOnly() throws Exception {
    assertTokenStreamContents(
        newTokenizer(
            MARKUP_TEXT,
            TOKEN_OUTPUT,
            Integer.toString(WikipediaTokenizer.UNTOKENIZED_ONLY),
            UNTOKENIZED_TYPES,
            CATEGORY_AND_ITALICS),
        new String[] {
          "a b c d", "e f g", "link", "here", "link",
          "there", "italics here", "something", "more italics", "h   i   j"
        },
        new int[] {11, 32, 42, 47, 56, 61, 71, 86, 98, 124},
        new int[] {18, 37, 46, 51, 60, 66, 83, 95, 110, 133},
        new int[] {1, 1, 1, 1, 1, 1, 1, 1, 1, 1});
  }

  /**
   * {@code BOTH} output with category and italics as untokenized types: every untokenized term is
   * expected to be followed by its individual words, the first of which has a position increment
   * of 0 and the same start offset as the untokenized term.
   */
  public void testTokenizerBoth() throws Exception {
    assertTokenStreamContents(
        newTokenizer(
            MARKUP_TEXT,
            TOKEN_OUTPUT,
            Integer.toString(WikipediaTokenizer.BOTH),
            UNTOKENIZED_TYPES,
            CATEGORY_AND_ITALICS),
        new String[] {
          "a b c d",
          "a",
          "b",
          "c",
          "d",
          "e f g",
          "e",
          "f",
          "g",
          "link",
          "here",
          "link",
          "there",
          "italics here",
          "italics",
          "here",
          "something",
          "more italics",
          "more",
          "italics",
          "h   i   j",
          "h",
          "i",
          "j"
        },
        new int[] {
          11, 11, 13, 15, 17, 32, 32, 34, 36, 42, 47, 56, 61, 71, 71, 79, 86, 98, 98, 103, 124, 124,
          128, 132
        },
        new int[] {
          18, 12, 14, 16, 18, 37, 33, 35, 37, 46, 51, 60, 66, 83, 78, 83, 95, 110, 102, 110, 133,
          125, 129, 133
        },
        new int[] {1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1});
  }

  /** Test that bogus arguments result in exception */
  public void testBogusArguments() throws Exception {
    IllegalArgumentException expected =
        expectThrows(
            IllegalArgumentException.class,
            () ->
                tokenizerFactory(WIKIPEDIA, "bogusArg", "bogusValue")
                    .create(newAttributeFactory()));
    assertTrue(expected.getMessage().contains("Unknown parameters"));
  }

  /** Test that a {@code tokenOutput} value of {@code -1} is rejected with a descriptive message. */
  public void testIllegalArguments() throws Exception {
    IllegalArgumentException expected =
        expectThrows(
            IllegalArgumentException.class,
            () -> tokenizerFactory(WIKIPEDIA, TOKEN_OUTPUT, "-1").create(newAttributeFactory()));
    assertTrue(
        expected
            .getMessage()
            .contains("tokenOutput must be TOKENS_ONLY, UNTOKENIZED_ONLY or BOTH"));
  }
}
